{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "\n",
    "\n",
    "This notebook accompanies the [workshop paper](http://jiahao.github.io/parallel-prefix/) by Jiahao Chen and Alan Edelman entitled \"Parallel Prefix Polymorphism Permits Parallelization, Presentation & Proof\" and will appear in the proceedings of the [First Workshop for High Performance Technical Computing in Dynamic Languages](http://jiahao.github.io/hptcdl-sc14/), held in conjunction with [SC14: The International Conference on High Performance Computing, Networking, Storage and Analysis](http://sc14.supercomputing.org/).\n",
    "\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\u001b[32m\u001b[1m  Updating\u001b[22m\u001b[39m registry at `~/.julia/registries/General`\n",
      "\u001b[32m\u001b[1m  Updating\u001b[22m\u001b[39m git-repo `https://github.com/JuliaRegistries/General.git`\n",
      "\u001b[?25l\u001b[2K\u001b[?25h\u001b[32m\u001b[1m Resolving\u001b[22m\u001b[39m package versions...\n",
      "\u001b[32m\u001b[1m Installed\u001b[22m\u001b[39m Media ────────────────── v0.4.1\n",
      "\u001b[32m\u001b[1m Installed\u001b[22m\u001b[39m Interact ─────────────── v0.9.0\n",
      "\u001b[32m\u001b[1m Installed\u001b[22m\u001b[39m InteractBulma ────────── v0.4.2\n",
      "\u001b[32m\u001b[1m Installed\u001b[22m\u001b[39m NLSolversBase ────────── v7.1.0\n",
      "\u001b[32m\u001b[1m Installed\u001b[22m\u001b[39m Knockout ─────────────── v0.2.0\n",
      "\u001b[32m\u001b[1m Installed\u001b[22m\u001b[39m DataArrays ───────────── v0.7.0\n",
      "\u001b[32m\u001b[1m Installed\u001b[22m\u001b[39m Gadfly ───────────────── v0.8.0\n",
      "\u001b[32m\u001b[1m Installed\u001b[22m\u001b[39m Compose ──────────────── v0.6.1\n",
      "\u001b[32m\u001b[1m Installed\u001b[22m\u001b[39m Loess ────────────────── v0.5.0\n",
      "\u001b[32m\u001b[1m Installed\u001b[22m\u001b[39m KernelDensity ────────── v0.5.0\n",
      "\u001b[32m\u001b[1m Installed\u001b[22m\u001b[39m CSSUtil ──────────────── v0.1.0\n",
      "\u001b[32m\u001b[1m Installed\u001b[22m\u001b[39m PDMats ───────────────── v0.9.5\n",
      "\u001b[32m\u001b[1m Installed\u001b[22m\u001b[39m Parameters ───────────── v0.10.1\n",
      "\u001b[32m\u001b[1m Installed\u001b[22m\u001b[39m CoupledFields ────────── v0.1.0\n",
      "\u001b[32m\u001b[1m Installed\u001b[22m\u001b[39m Optim ────────────────── v0.17.0\n",
      "\u001b[32m\u001b[1m Installed\u001b[22m\u001b[39m Arpack ───────────────── v0.2.3\n",
      "\u001b[32m\u001b[1m Installed\u001b[22m\u001b[39m Juno ─────────────────── v0.5.3\n",
      "\u001b[32m\u001b[1m Installed\u001b[22m\u001b[39m PositiveFactorizations ─ v0.2.1\n",
      "\u001b[32m\u001b[1m Installed\u001b[22m\u001b[39m InteractBase ─────────── v0.8.1\n",
      "\u001b[32m\u001b[1m Installed\u001b[22m\u001b[39m Hexagons ─────────────── v0.1.0\n",
      "\u001b[32m\u001b[1m Installed\u001b[22m\u001b[39m Distributions ────────── v0.16.3\n",
      "\u001b[32m\u001b[1m Installed\u001b[22m\u001b[39m QuadGK ───────────────── v2.0.2\n",
      "\u001b[32m\u001b[1m  Updating\u001b[22m\u001b[39m `~/.julia/environments/v1.0/Project.toml`\n",
      " \u001b[90m [a81c6b42]\u001b[39m\u001b[92m + Compose v0.6.1\u001b[39m\n",
      " \u001b[90m [c91e804a]\u001b[39m\u001b[92m + Gadfly v0.8.0\u001b[39m\n",
      " \u001b[90m [c601a237]\u001b[39m\u001b[92m + Interact v0.9.0\u001b[39m\n",
      "\u001b[32m\u001b[1m  Updating\u001b[22m\u001b[39m `~/.julia/environments/v1.0/Manifest.toml`\n",
      " \u001b[90m [7d9fca2a]\u001b[39m\u001b[92m + Arpack v0.2.3\u001b[39m\n",
      " \u001b[90m [70588ee8]\u001b[39m\u001b[92m + CSSUtil v0.1.0\u001b[39m\n",
      " \u001b[90m [49dc2e85]\u001b[39m\u001b[92m + Calculus v0.4.1\u001b[39m\n",
      " \u001b[90m [324d7699]\u001b[39m\u001b[92m + CategoricalArrays v0.3.13\u001b[39m\n",
      " \u001b[90m [a81c6b42]\u001b[39m\u001b[92m + Compose v0.6.1\u001b[39m\n",
      " \u001b[90m [7ad07ef1]\u001b[39m\u001b[92m + CoupledFields v0.1.0\u001b[39m\n",
      " \u001b[90m [0fe7c1db]\u001b[39m\u001b[92m + DataArrays v0.7.0\u001b[39m\n",
      " \u001b[90m [a93c6f00]\u001b[39m\u001b[92m + DataFrames v0.13.1\u001b[39m\n",
      " \u001b[90m [9a8bc11e]\u001b[39m\u001b[92m + DataStreams v0.4.0\u001b[39m\n",
      " \u001b[90m [01453d9d]\u001b[39m\u001b[92m + DiffEqDiffTools v0.6.0\u001b[39m\n",
      " \u001b[90m [31c24e10]\u001b[39m\u001b[92m + Distributions v0.16.3\u001b[39m\n",
      " \u001b[90m [c91e804a]\u001b[39m\u001b[92m + Gadfly v0.8.0\u001b[39m\n",
      " \u001b[90m [a1b4810d]\u001b[39m\u001b[92m + Hexagons v0.1.0\u001b[39m\n",
      " \u001b[90m [c601a237]\u001b[39m\u001b[92m + Interact v0.9.0\u001b[39m\n",
      " \u001b[90m [d3863d7c]\u001b[39m\u001b[92m + InteractBase v0.8.1\u001b[39m\n",
      " \u001b[90m [7981ab7d]\u001b[39m\u001b[92m + InteractBulma v0.4.2\u001b[39m\n",
      " \u001b[90m [e5e0dc1b]\u001b[39m\u001b[92m + Juno v0.5.3\u001b[39m\n",
      " \u001b[90m [5ab0869b]\u001b[39m\u001b[92m + KernelDensity v0.5.0\u001b[39m\n",
      " \u001b[90m [bcebb21b]\u001b[39m\u001b[92m + Knockout v0.2.0\u001b[39m\n",
      " \u001b[90m [d3d80556]\u001b[39m\u001b[92m + LineSearches v7.0.0\u001b[39m\n",
      " \u001b[90m [4345ca2d]\u001b[39m\u001b[92m + Loess v0.5.0\u001b[39m\n",
      " \u001b[90m [e89f7d12]\u001b[39m\u001b[92m + Media v0.4.1\u001b[39m\n",
      " \u001b[90m [d41bc354]\u001b[39m\u001b[92m + NLSolversBase v7.1.0\u001b[39m\n",
      " \u001b[90m [429524aa]\u001b[39m\u001b[92m + Optim v0.17.0\u001b[39m\n",
      " \u001b[90m [90014a1f]\u001b[39m\u001b[92m + PDMats v0.9.5\u001b[39m\n",
      " \u001b[90m [d96e819e]\u001b[39m\u001b[92m + Parameters v0.10.1\u001b[39m\n",
      " \u001b[90m [85a6dd25]\u001b[39m\u001b[92m + PositiveFactorizations v0.2.1\u001b[39m\n",
      " \u001b[90m [1fd47b50]\u001b[39m\u001b[92m + QuadGK v2.0.2\u001b[39m\n",
      " \u001b[90m [79098fc4]\u001b[39m\u001b[92m + Rmath v0.5.0\u001b[39m\n",
      " \u001b[90m [4c63d2b9]\u001b[39m\u001b[92m + StatsFuns v0.7.0\u001b[39m\n",
      " \u001b[90m [ea10d353]\u001b[39m\u001b[92m + WeakRefStrings v0.5.2\u001b[39m\n",
      " \u001b[90m [9fa8497b]\u001b[39m\u001b[92m + Future \u001b[39m\n",
      " \u001b[90m [9abbd945]\u001b[39m\u001b[92m + Profile \u001b[39m\n",
      " \u001b[90m [4607b0f0]\u001b[39m\u001b[92m + SuiteSparse \u001b[39m\n",
      "\u001b[32m\u001b[1m  Building\u001b[22m\u001b[39m Knockout ─────→ `~/.julia/packages/Knockout/JIqpG/deps/build.log`\n",
      "\u001b[32m\u001b[1m  Building\u001b[22m\u001b[39m InteractBase ─→ `~/.julia/packages/InteractBase/Q4IkI/deps/build.log`\n",
      "\u001b[32m\u001b[1m  Building\u001b[22m\u001b[39m InteractBulma → `~/.julia/packages/InteractBulma/Ohu5Y/deps/build.log`\n",
      "\u001b[32m\u001b[1m  Building\u001b[22m\u001b[39m Arpack ───────→ `~/.julia/packages/Arpack/WP3ru/deps/build.log`\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "┌ Info: Precompiling Compose [a81c6b42-2e10-5240-aca2-a61377ecd94b]\n",
      "└ @ Base loading.jl:1186\n",
      "ERROR: LoadError: syntax: try without catch or finally\n",
      "Stacktrace:\n",
      " [1] include at ./boot.jl:317 [inlined]\n",
      " [2] include_relative(::Module, ::String) at ./loading.jl:1038\n",
      " [3] include(::Module, ::String) at ./sysimg.jl:29\n",
      " [4] top-level scope at none:2\n",
      " [5] eval at ./boot.jl:319 [inlined]\n",
      " [6] eval(::Expr) at ./client.jl:389\n",
      " [7] top-level scope at ./none:3\n",
      "in expression starting at /home/mbauman/.julia/packages/Compose/y7cU7/src/Compose.jl:207\n"
     ]
    },
    {
     "ename": "ErrorException",
     "evalue": "Failed to precompile Compose [a81c6b42-2e10-5240-aca2-a61377ecd94b] to /home/mbauman/.julia/compiled/v1.0/Compose/sbiEw.ji.",
     "output_type": "error",
     "traceback": [
      "Failed to precompile Compose [a81c6b42-2e10-5240-aca2-a61377ecd94b] to /home/mbauman/.julia/compiled/v1.0/Compose/sbiEw.ji.",
      "",
      "Stacktrace:",
      " [1] error(::String) at ./error.jl:33",
      " [2] macro expansion at ./logging.jl:313 [inlined]",
      " [3] compilecache(::Base.PkgId, ::String) at ./loading.jl:1184",
      " [4] macro expansion at ./logging.jl:311 [inlined]",
      " [5] _require(::Base.PkgId) at ./loading.jl:941",
      " [6] require(::Base.PkgId) at ./loading.jl:852",
      " [7] macro expansion at ./logging.jl:311 [inlined]",
      " [8] require(::Module, ::Symbol) at ./loading.jl:834",
      " [9] top-level scope at In[1]:2"
     ]
    }
   ],
   "source": [
    "# using Pkg; Pkg.add([\"Compose\", \"Gadfly\", \"Interact\"])\n",
    "using Compose, Gadfly, Interact"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# `reduce()`\n",
    "\n",
    "Reduction applies a binary operator to a vector repeatedly to return a scalar. Thus + becomes sum, and * becomes prod.\n",
    "\n",
    "It is considered a basic parallel computing primitive.\n",
    "\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "reduce(+, 1:8), sum(1:8)  # triangular numbers"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "reduce(*, 1:8), prod(1:8) # factorials"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "boring(a,b)=a\n",
    "@show reduce(boring, 1:8)\n",
    "boring2(a,b)=b\n",
    "@show reduce(boring2, 1:8)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "You can also use reduce to compute Fibonacci numbers using their recurrences.\n",
    "\n",
    "$$\\begin{pmatrix} f_2 \\\\f_1 \\end{pmatrix} = \\begin{pmatrix} f_1 + f_0 \\\\ f_1 \\end{pmatrix}\n",
    "= \\begin{pmatrix} 1 & 1 \\\\ 1 & 0 \\end{pmatrix} \\begin{pmatrix} f_1 \\\\ f_0 \\end{pmatrix} $$\n",
    "\n",
    "$$\\begin{pmatrix} f_{n+1} \\\\ f_n \\end{pmatrix} = \\dots\n",
    "= \\begin{pmatrix} 1 & 1 \\\\ 1 & 0 \\end{pmatrix}^n \\begin{pmatrix} f_1 \\\\ f_0 \\end{pmatrix} $$\n",
    "\n",
    "From this, you can show that\n",
    "\n",
    "$$\\begin{pmatrix} 1 & 1 \\\\ 1 & 0 \\end{pmatrix}^n  = \\begin{pmatrix} f_{n+1} & f_n \\\\ f_n & f_{n-1} \\end{pmatrix} $$\n",
    "\n",
    "(this applies reduce to the same argument over and over again -- there are of course other ways)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "M=[1 1; 1 0]\n",
    "reduce(*,fill(M,3))\n",
    "prod(fill(M,3))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "@manipulate for n=1:100\n",
    "    prod(fill(big.(M),n))\n",
    "end"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "fib(j)=reduce(*, fill(M,j))\n",
    "fib.([4,7])\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "You can solve recurrences of any complexity using `reduce`. For example, `reduce` can compute a Hadamard matrix from its definition in terms of its submatrices:\n",
    "\n",
    "$$H_2 = \\begin{pmatrix} H_1 & H_1 \\\\ H_1 & -H_1 \\end{pmatrix} = \\begin{pmatrix} 1 & 1 \\\\ 1 & -1 \\end{pmatrix} \\otimes H_1$$\n",
    "\n",
    "and so on.\n",
    "\n",
    "(Note: this is just using reduce to compute a matrix power.\n",
    "One can think of alternative ways for sure.)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# [A B]\n",
    "# If A is m x n\n",
    "# If B is p x q\n",
    "# then kron(A,B) is mp x nq and has all the elements of A times all of the elements of B"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "A=[1 2;3 4]\n",
    "B=[10 100; 1 -10]\n",
    "⊗(A,B)=kron(A,B)\n",
    "\n",
    "M=[ 1 1;1 -1]\n",
    "H=⊗(⊗(⊗(M,M),M),M)\n",
    "H*H'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "Hadamard(n)=reduce(⊗, fill(M,n))\n",
    "H=Hadamard(3)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "H'H"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "H*H' #This is a legitimate Hadamard matrix"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "using StatsBase"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "rolldice() = rand(1:6)+rand(1:6)\n",
    "rolls(n)=fit(Histogram,[rolldice() for i=1:n ],2:12,closed=:left)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "rolls(1000)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "[rolls(100) for i=1:10]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "reduce(merge,[rolls(100) for i=1:10])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "#Reduction of matrix multiplications\n",
    "M=[rand(-3:3,2,2) for i=1:4]\n",
    "\n",
    "display(M[4]*M[3]*M[2]*M[1])\n",
    "display(reduce((A,B)->B*A, M)) #backward multiply\n",
    "display(reduce(*, M))          #forward multiply"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "In the following example we apply `reduce()` to  function composition:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "(sin ∘ cos)(1), sin(cos(1))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "((-) ∘ sin)(1), -sin(1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "( (!) ∘ )"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# h=reduce( (f,g) -> (x->f(g(x))), [sin cos tan])\n",
    "h=reduce(∘, [sin cos tan])\n",
    "[h(π) sin(cos(tan(π)))]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "cumsum(1:8)  # It is useful to know that cumsum is a linear operator\n",
    "# You can use power method! Below is the underlying matrix\n",
    "A=tril(ones(Int,8,8)) "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "[A*(1:8),cumsum(1:8)]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# `prefix`"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Having discussed `reduce`, we are now ready for the idea behind prefix sum.\n",
    "Prefix or scan is long considered an important parallel\n",
    "primitive as well.\n",
    "\n",
    "Suppose you wanted to compute the partial sums of a vector, i.e. given\n",
    "`y[1:n]`, we want to overwrite the vector `y` with the vector of partial sums\n",
    "\n",
    "```julia\n",
    "new_y[1] = y[1]\n",
    "new_y[2] = y[1] + y[2]\n",
    "new_y[3] = y[1] + y[2] + y[3]\n",
    "...\n",
    "```\n",
    "\n",
    "At first blush, it seems impossible to parallelize this, since\n",
    "\n",
    "```julia\n",
    "new_y[1] = y[1]\n",
    "new_y[2] = new_y[1] + y[2]\n",
    "new_y[3] = new_y[2] + y[3]\n",
    "...\n",
    "```\n",
    "\n",
    "which appears to be an intrinsically serial process."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "function prefix_serial!(y, ⊕)\n",
    "    for i=2:length(y)\n",
    "        y[i] = y[i-1] ⊕ y[i]\n",
    "        end\n",
    "    y\n",
    "end"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "prefix_serial!([1:8;],*)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "prefix_serial!([rand(1:5,2,2) for i=1:4],*)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "However, it turns out that because addition (`+`) is associative, we can regroup the _order_ of how these sums are carried out. (This of course extends to other associative operations such as multiplication.) Another ordering of 8 associative operations is provided by `prefix8!`:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Eight only for pedagogy\n",
    "function prefix8!(y, ⋅)\n",
    "    length(y)==8 || error(\"length 8 only\")\n",
    "    for i in [2,4,6,8]; y[i] = y[i-1] ⋅ y[i]; end\n",
    "    for i in [  4,  8]; y[i] = y[i-2] ⋅ y[i]; end\n",
    "    for i in [      8]; y[i] = y[i-4] ⋅ y[i]; end\n",
    "    for i in [    6  ]; y[i] = y[i-2] ⋅ y[i]; end\n",
    "    for i in [ 3,5,7 ]; y[i] = y[i-1] ⋅ y[i]; end\n",
    "    y\n",
    "end\n",
    "\n",
    "# Generalization to any n\n",
    "function prefix!(y, ⋅)\n",
    "    l=length(y)\n",
    "    k=ceil(Int, log2(l))\n",
    "    @inbounds for j=1:k, i=2^j:2^j:min(l, 2^k)              #\"reduce\"\n",
    "        y[i] = y[i-2^(j-1)] ⋅ y[i]\n",
    "    end\n",
    "    @inbounds for j=(k-1):-1:1, i=3*2^(j-1):2^j:min(l, 2^k) #\"broadcast\"\n",
    "        y[i] = y[i-2^(j-1)] ⋅ y[i]\n",
    "    end\n",
    "    y\n",
    "end"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "prefix!([1:12;],*)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Polymorphism for visualization"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "We can visualize the operations with a little bit of trickery. In Julia, arrays are simply types that expose the array protocol. In particular, they need to implement  methods for the generic functions `length`, `getindex` and `setindex!`. The last two are used in indexing operations, since statements\n",
    "\n",
    "    y = A[1]\n",
    "    A[3] = y\n",
    "    \n",
    "get desugared to\n",
    "\n",
    "    y = getindex(A, 1)\n",
    "    setindex!(A, y, 3)\n",
    "    \n",
    "respectively.\n",
    "\n",
    "We can trace through the iterable by introduce a dummy array type, `DummyArray`, which stores no useful information but records every access to `getindex` and `setindex!`.\n",
    "\n",
    "Specifically:\n",
    "\n",
    "- `length(A::DummyArray)` returns an integer that is stored internally in the `A.length` field\n",
    "- `getindex(A::DummyArray, i)` records read access to the index `i` in the `A.read` field and always retuns `nothing`.\n",
    "- `setindex!(A::DummyArray, x, i)` records write access to the index `i`. The `A.history` field is appended with a new tuple consisting of the current `A.read` field and the index `i`. \n",
    "\n",
    "The way `DummyArray` works, it assumes an association between a single `setindex!` call and and all the preceding `getindex` calls since the previous `setindex!` call, which is sufficient for the purposes of tracing through prefix calls."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import Base: getindex, setindex!, length\n",
    "\n",
    "isdefined(:DummyArray) || type DummyArray\n",
    "    length :: Int\n",
    "    read :: Vector\n",
    "    history :: Vector{Any}\n",
    "    DummyArray(length, read=[], history=[])=new(length, read, history)\n",
    "end\n",
    "\n",
    "length(A::DummyArray)=A.length\n",
    "\n",
    "function getindex(A::DummyArray, i)\n",
    "    push!(A.read, i)\n",
    "    nothing\n",
    "end\n",
    "\n",
    "function setindex!(A::DummyArray, x, i)\n",
    "    push!(A.history, (A.read, [i]))\n",
    "    A.read = []\n",
    "end"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "M = DummyArray(4)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "M[7] = M[3] + M[2]\n",
    "M"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import Base.+\n",
    "+(a::Void, b::Void)=a"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "@manipulate for i=1:100\n",
    "    M = DummyArray(4)\n",
    "    M[i] = M[i-2] + M[2*i]\n",
    "    M\n",
    "end"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "We also need a dummy associative operator to pass to the prefix call."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "A=prefix8!(DummyArray(8),+)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "A.history"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Now let's visualize this! Each entry in `A.history` is rendered by a gate object:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [
    {
     "ename": "LoadError",
     "evalue": "\u001b[91mUndefVarError: @colorant_str not defined\u001b[39m",
     "output_type": "error",
     "traceback": [
      "\u001b[91mUndefVarError: @colorant_str not defined\u001b[39m",
      ""
     ]
    }
   ],
   "source": [
    "mutable struct gate\n",
    "    ins :: Vector\n",
    "    outs:: Vector\n",
    "end\n",
    "\n",
    "import Gadfly.render\n",
    "\n",
    "function render(G::gate, x₁, y₁, y₀; rᵢ=0.1, rₒ=0.25)\n",
    "    ipoints = [(i, y₀+rᵢ) for i in G.ins]\n",
    "    opoints = [(i, y₀+0.5) for i in G.outs]\n",
    "    igates  = [Compose.circle(i..., rᵢ) for i in ipoints]\n",
    "    ogates  = [Compose.circle(i..., rₒ) for i in opoints]\n",
    "    lines = [line([i, j]) for i in ipoints, j in opoints]\n",
    "    compose(context(units=UnitBox(0.5,0,x₁,y₁+1)),\n",
    "    compose(context(), stroke(colorant\"black\"), fill(colorant\"white\"),\n",
    "            igates..., ogates...),\n",
    "    compose(context(), linewidth(0.3mm), stroke(colorant\"black\"),\n",
    "            lines...))\n",
    "end\n",
    "\n",
    "A=gate([1,2],[2])\n",
    "render(A,2,0,0)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Now we render the whole algorithm. We have to scan through the trace twice; the first time merely calculates the maximum depth that needs to be drawn and the second time actually generates the objects."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "function render(A::DummyArray)\n",
    "    #Scan to find maximum depth\n",
    "    olast = depth = 0\n",
    "    for y in A.history\n",
    "        (any(y[1] .≤ olast)) && (depth += 1)\n",
    "        olast = maximum(y[2])\n",
    "    end\n",
    "    maxdepth = depth\n",
    "    \n",
    "    olast = depth = 0\n",
    "    C = []\n",
    "    for y in A.history\n",
    "        (any(y[1] .≤ olast)) && (depth += 1)\n",
    "        push!(C, render(gate(y...), A.length, maxdepth, depth))\n",
    "        olast = maximum(y[2])\n",
    "    end\n",
    "    \n",
    "    push!(C, compose(context(units=UnitBox(0.5,0,A.length,1)),\n",
    "      [line([(i,0), (i,1)]) for i=1:A.length]...,\n",
    "    linewidth(0.1mm), stroke(colorant\"grey\")))\n",
    "    compose(context(), C...)\n",
    "end"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "render(prefix!(DummyArray(8), +))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Now we can see that `prefix!` rearranges the operations to form two spanning trees:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "render(prefix!(DummyArray(120),+))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "function prefix!(y, +)\n",
    "    l=length(y)\n",
    "    k=ceil(Int, log2(l))\n",
    "    @inbounds for j=1:k, i=2^j:2^j:min(l, 2^k)              #\"reduce\"\n",
    "        y[i] = y[i-2^(j-1)] + y[i]\n",
    "    end\n",
    "    @inbounds for j=(k-1):-1:1, i=3*2^(j-1):2^j:min(l, 2^k) #\"broadcast\"\n",
    "        y[i] = y[i-2^(j-1)] + y[i]\n",
    "    end\n",
    "    y\n",
    "end\n",
    "@show prefix!(DummyArray(8),+)\n",
    "render(prefix!(DummyArray(9),+))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "#Look at the compose tree\n",
    "#render(prefix8!(AccessArray(8),+)) |> introspect"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "as contrasted with the serial code:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "render(prefix_serial!(DummyArray(8),+))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "@manipulate for npp=1:180\n",
    "    render(prefix!(DummyArray(npp),+))\n",
    "end"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "@manipulate for npp=1:180\n",
    "    render(prefix_serial!(DummyArray(npp),+))\n",
    "end"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "draw(PDF(\"serial.pdf\", 3.5inch, 3.5inch), render(prefix_serial!(AccessArray(8),+)))\n",
    "draw(PDF(\"tree.pdf\", 3.5inch, 3.5inch), render(prefix8!(AccessArray(8),+)))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Polymorphism for parallel operations"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Now let's run `prefix` in parallel on every core on the CPU. Use `addprocs` to attach additional worker processes."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import Base: fetch, length, +, *\n",
    "fetch(t::Vector) = map(fetch, t) #Vectorize fetch\n",
    "\n",
    "#Define elementary operations on remote data\n",
    "length(r1::Future)=length(fetch(r1))\n",
    "+(r1::Future,r2::Future)=@spawnat r2.where fetch(r1)+fetch(r2)\n",
    "*(r1::Future,r2::Future)=@spawnat r2.where fetch(r1)*fetch(r2)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "For 8 processes, the serial version requires 7 operations. The parallel version uses 11 operations, but they are grouped into 5 simultaneous chunks of operations. Hence we expect that the parallel version runs in 5/7 the time needed for the naïve serial code."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "#Before timing, force Julia to JIT compile the functions\n",
    "[f([randn(3,3) for i=1:2], *) for f in (prefix_serial!, prefix!)]\n",
    "\n",
    "# addprocs(max(0, Sys.CPU_CORES-nprocs())) #Add all possible processors\n",
    "using JuliaRunClient\n",
    "# initializeCluster(2)\n",
    "initializeCluster(16) # more workers can be added with a larger subscription\n",
    "\n",
    "timings=Dict{Int,Tuple{Float64,Float64}}()\n",
    "for np in 2:nprocs()\n",
    "    gc_enable(false) #Disable garbage collector for timing purposes\n",
    "    n=1024#4096#3072\n",
    "    \n",
    "    #Create RemoteRefs on np processes\n",
    "    r=[@spawnat p randn(n,n) for p in procs()[1:np]]\n",
    "    s=fetch(r) #These are in-memory matrices\n",
    "\n",
    "    #Repeat timings a couple of times and take smallest\n",
    "    t_ser = minimum([(tic(); prefix_serial!(s, *); toc()) for i=1:2])\n",
    "    t_par = minimum([(tic(); @sync prefix!(r, *) ; toc()) for i=1:2])\n",
    "    timings[np] = (t_ser, t_par)\n",
    "    gc_enable(true)\n",
    "end\n",
    "timings"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "#timings = #Data from julia.mit.edu\n",
    "#Dict(\n",
    "#     1 => ( 15.585,  15.721),\n",
    "#     2 => ( 16.325,  15.893),\n",
    "#     3 => ( 32.079,  31.678),\n",
    "#     4 => ( 46.037,  33.257),\n",
    "#     5 => ( 61.947,  51.412),\n",
    "#     6 => ( 77.247,  50.769), \n",
    "#     7 => ( 92.588,  67.796),\n",
    "#     8 => (108.057,  65.185),\n",
    "#     9 => (123.556,  68.197),\n",
    "#    10 => (138.408,  66.658),\n",
    "#    11 => (153.392,  82.580),\n",
    "#    12 => (168.874,  83.373),\n",
    "#    13 => (184.227,  82.875),\n",
    "#    14 => (199.741,  83.098),\n",
    "#    15 => (215.881, 101.271),\n",
    "#    16 => (230.531, 104.860),\n",
    "#    17 => (246.576, 103.229),\n",
    "#    18 => (261.412, 102.497),\n",
    "#    19 => (276.926, 103.367),\n",
    "#    20 => (294.404, 104.162),\n",
    "#    21 => (308.995, 104.944),\n",
    "#    22 => (323.287, 104.838),\n",
    "#    23 => (340.173, 121.495),\n",
    "#    24 => (353.717, 119.729),\n",
    "#    25 => (369.113, 120.281),\n",
    "#    26 => (384.638, 118.710),\n",
    "#    27 => (402.484, 119.237),\n",
    "#    28 => (417.036, 119.980),\n",
    "#    29 => (431.273, 120.463),\n",
    "#    30 => (446.288, 120.560),\n",
    "#    31 => (461.756, 137.209),\n",
    "#    32 => (476.653, 135.873),\n",
    "#    33 => (493.989, 138.150),\n",
    "#    34 => (508.089, 137.998),\n",
    "#    35 => (523.703, 139.059),\n",
    "#    36 => (539.305, 135.201),\n",
    "#    37 => (554.201, 138.242),\n",
    "#    38 => (569.276, 137.470),\n",
    "#    39 => (585.268, 137.261),\n",
    "#    40 => (599.921, 137.871),\n",
    "#    41 => (615.718, 137.887),\n",
    "#    42 => (632.463, 137.579),\n",
    "#    43 => (647.355, 138.181),\n",
    "#    44 => (664.344, 139.917),\n",
    "#    45 => (678.899, 138.382),\n",
    "#    46 => (695.815, 138.354),\n",
    "#    47 => (717.068, 154.826),\n",
    "#    48 => (730.902, 155.603),\n",
    "#    49 => (745.608, 154.549),\n",
    "#    50 => (756.012, 154.244),\n",
    "#    51 => (769.611, 154.000),\n",
    "#    52 => (793.350, 152.893),\n",
    "#    53 => (801.568, 155.276),\n",
    "#    54 => (818.174, 153.436),\n",
    "#    55 => (832.994, 155.923),\n",
    "#    56 => (849.526, 155.721),\n",
    "#    57 => (863.337, 155.289),\n",
    "#    58 => (881.577, 156.188),\n",
    "#    59 => (893.735, 157.889),\n",
    "#    60 => (911.597, 155.557),\n",
    "#    61 => (925.117, 157.048),\n",
    "#    62 => (940.897, 153.338),\n",
    "#    63 => (954.825, 173.942),\n",
    "#    64 => (990.757, 173.236),\n",
    "#    65 => (987.457, 172.005),\n",
    "#    66 => (1009.688, 173.869),\n",
    "#    67 => (1022.357, 173.303),\n",
    "#    68 => (1043.536, 175.044),\n",
    "#    69 => (1052.156, 173.361),\n",
    "#    70 => (1067.174, 172.230),\n",
    "#    71 => (1079.543, 173.491),\n",
    "#    72 => (1101.516, 174.311),\n",
    "#    73 => (1119.993, 174.057),\n",
    "#    74 => (1136.092, 175.804),\n",
    "#    75 => (1154.758, 175.075),\n",
    "#    76 => (1170.732, 173.453),\n",
    "#    77 => (1184.598, 171.936),\n",
    "#    78 => (1190.922, 172.809),\n",
    "#    79 => (1204.645, 172.878),\n",
    "#)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "speedup(p::Integer) = ((p-1)/(floor(Int, log2(p)) + 1 + floor(Int, log2(p/3))))\n",
    "#for (np, (t_ser, t_par)) in sort!([x for x in timings])\n",
    "#    @printf(\"np: %3d Serial: %.3f sec  Parallel: %.3f sec  speedup: %.3fx (theory=%.3fx)\\n\",\n",
    "#        np, t_ser, t_par, t_ser/t_par, speedup(np))\n",
    "#end\n",
    "\n",
    "xend = 1+maximum(keys(timings))\n",
    "\n",
    "theplot=plot(layer(x=2:xend, y=map(speedup, 2:xend), Geom.line(), Theme(default_color=colorant\"darkkhaki\")),\n",
    "layer(x=[np+1 for (np, (t_ser, t_par)) in timings],\n",
    "      y=[t_ser/t_par for (np, (t_ser, t_par)) in timings],\n",
    "    Geom.point, Theme(default_color=colorant\"firebrick\",\n",
    "        #=default_point_size=1.25pt, =#highlight_width=0pt)),\n",
    "    Guide.xlabel(\"Number of processors, p\"), Guide.ylabel(\"Speedup, r\"))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "labeledplot = compose(render(theplot),\n",
    "compose(context(), stroke(colorant\"purple\"), #Powers of 2\n",
    "text(0.21, 0.73, \"2\"), text(0.24, 0.72, \"4\"), text(0.27, 0.7, \"8\"), text(0.33, 0.63, \"16\"),\n",
    "text(0.47, 0.53, \"32\"), text(0.76, 0.35, \"64\")),\n",
    "compose(context(), stroke(colorant\"green\"), #3x powers of 2\n",
    "text(0.25, 0.62, \"6\"), text(0.28, 0.57, \"12\"), text(0.39, 0.49, \"24\"), text(0.63, 0.32, \"48\"),\n",
    ")\n",
    ")\n",
    "draw(SVG(3.5inch, 3.2inch), labeledplot)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "draw(SVG(\"scaling.svg\", 3.5inch, 3.2inch), labeledplot)\n",
    "#draw(PDF(\"scaling.pdf\", 3.5inch, 3inch), theplot)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "The exact same serial code now runs in parallel thanks to multiple dispatch!"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Formal verification\n",
    "\n",
    "Julia allows us to implement very easily the interval of summations monoid technique for verifying the correctness of prefix sums ([doi:10.1145/2535838.2535882](http://dx.doi.org/10.1145/2535838.2535882))."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "#Definition 4.3 of Chong et al., 2014\n",
    "abstract type ⊤ end\n",
    "abstract type Id end\n",
    "const 𝕊 = Union{UnitRange, Type{⊤}, Type{Id}}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "⊕(I::UnitRange, J::UnitRange) = I.stop+1==J.start ? (I.start:J.stop) : ⊤\n",
    "⊕(::Type{Id}, ::Type{Id}) = Id\n",
    "⊕(I::𝕊, ::Type{Id}) = I\n",
    "⊕(::Type{Id}, I::𝕊) = I\n",
    "⊕(I::𝕊, J::𝕊) = ⊤"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "for kernel in (prefix_serial!, prefix!), n=1:1000\n",
    "    @assert kernel([k:k for k=1:n], ⊕) == [1:k for k=1:n]\n",
    "end"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Wrong kernels produce type errors!"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "function prefix_wrong!(y, .+)\n",
    "    l = length(y)\n",
    "    offset = 1\n",
    "    z = copy(y)\n",
    "    while offset < l\n",
    "        for i=offset+1:l\n",
    "            z[i] = y[i-offset] .+ z[i]\n",
    "        end\n",
    "        offset += 2\n",
    "        y = copy(z)\n",
    "    end\n",
    "    y\n",
    "end\n",
    "\n",
    "prefix_wrong!(Any[k:k for k=1:8], ⊕) #Should produce conversion error"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "@manipulate for npp=1:10\n",
    "    render(prefix_wrong!(AccessArray(npp),+))\n",
    "end"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Experimental code which doesn't work\n",
    "\n",
    "In this section I play with other parallel prefix algorithms. All the ones here suffer from read antidependencies and they break the logic I used to draw the gates."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "#Kogge-Stone variant\n",
    "#Works but doesn't render properly\n",
    "function prefix_ks!(y, .+)\n",
    "    l = length(y)\n",
    "    offset = 1\n",
    "    z = copy(y)\n",
    "    while offset < l\n",
    "        for i=offset+1:l\n",
    "            z[i] = y[i-offset] .+ z[i]\n",
    "        end\n",
    "        offset *= 2\n",
    "        y = copy(z)\n",
    "    end\n",
    "    y\n",
    "end\n",
    "\n",
    "@assert prefix_ks!(ones(8), +) == [1:8;]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "Base.copy(A::AccessArray) = A\n",
    "\n",
    "render(prefix_ks!(AccessArray(8),+))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "#Hillis-Steele = Kogge-Stone\n",
    "function prefix_hs!(y, +)\n",
    "    l = length(y)\n",
    "    lk = floor(Int, log2(l))\n",
    "    for d=1:lk\n",
    "        for k=2d:l\n",
    "            y[k] = y[k-2^(d-1)] + y[k]\n",
    "        end\n",
    "    end\n",
    "    y\n",
    "end\n",
    "prefix_hs!(ones(4), +)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "render(prefix_hs!(AccessArray(4), +))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "#Sklansky\n",
    "function prefix_s!(y, +)\n",
    "    n = length(y)\n",
    "    for d=1:log2(n)\n",
    "        for k=1:n÷2+1\n",
    "            block = 2 * (k - (mod(k, 2^d)))\n",
    "            me = block + mod(k, 2^d) + 2^d\n",
    "            spine = block + 2^d - 1\n",
    "            if spine ≤ n && me ≤ n\n",
    "                y[me] = y[me] + y[spine]\n",
    "            end\n",
    "        end\n",
    "    end\n",
    "    y\n",
    "end\n",
    "prefix_s!(ones(4), +)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "function prefix(v,⊕)\n",
    "    @show v\n",
    "    k=length(v)\n",
    "    k≤1 && return v\n",
    "    v[2:2:k] = @show v[1:2:k-1]⊕v[2:2:k]  # Line 1\n",
    "    v[2:2:k] = @show prefix(v[2:2:k],⊕)   # Line 2\n",
    "    v[3:2:k] = @show v[2:2:k-1]⊕v[3:2:k]  # Line 3\n",
    "    v\n",
    "end"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "prefix(ones(8),+)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "function ⊕{T<:Range}(v1::Vector{T}, v2::Vector{T})\n",
    "    for v in (v1, v2), i in 1:length(v)-1\n",
    "        @assert v[i].stop + 1 == v[i+1].start\n",
    "    end\n",
    "    if length(v1)>0 && length(v2)>0\n",
    "        @assert v1[end].stop + 1 == v2[1].start\n",
    "        v1[1].start:v2[end].stop\n",
    "    elseif length(v1)>0 #&& length(v2)==0\n",
    "        v1[1].start:v1[end].stop\n",
    "    elseif length(v2)>0 #&& length(v1)==0\n",
    "        v2[1].start:v2[end].stop\n",
    "    else #Both are empty\n",
    "        v1\n",
    "    end\n",
    "end"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "for kernel in (prefix,), n=1:100#(prefix_serial!, prefix!, prefix_ks!), n=1:100\n",
    "    @assert kernel([k:k for k=1:n], ⊕) == [1:k for k=1:n]\n",
    "end"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Julia 1.0.0",
   "language": "julia",
   "name": "julia-1.0"
  },
  "language_info": {
   "file_extension": ".jl",
   "mimetype": "application/julia",
   "name": "julia",
   "version": "1.0.0"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 1
}
